{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import tensorflow_datasets as tfds\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = tfds.load('glue/sst2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'test': <_OptionsDataset shapes: {idx: (), label: (), sentence: ()}, types: {idx: tf.int32, label: tf.int64, sentence: tf.string}>,\n",
       " 'train': <_OptionsDataset shapes: {idx: (), label: (), sentence: ()}, types: {idx: tf.int32, label: tf.int64, sentence: tf.string}>,\n",
       " 'validation': <_OptionsDataset shapes: {idx: (), label: (), sentence: ()}, types: {idx: tf.int32, label: tf.int64, sentence: tf.string}>}"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "66730\n",
      "0\n",
      "b\"with outtakes in which most of the characters forget their lines and just utter ` uhhh , ' which is better than most of the writing in the movie \"\n"
     ]
    }
   ],
   "source": [
    "for example in data['train'].take(1):\n",
    "    print(example['idx'].numpy())\n",
    "    print(example['label'].numpy())\n",
    "    print(example['sentence'].numpy())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = tfds.features.text.Tokenizer()\n",
    "vocabulary_set = set()\n",
    "for example in data['train']:\n",
    "    example_tokens = tokenizer.tokenize(example['sentence'].numpy())\n",
    "    vocabulary_set.update(example_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<TokenTextEncoder vocab_size=13811>"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "encoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "794\n",
      "1\n",
      "b\"it confirms fincher 's status as a film maker who artfully bends technical know-how to the service of psychological insight . \"\n",
      "[4814, 4646, 9266, 5348, 5997, 5744, 13628, 2471, 12860, 4064, 6245, 13810, 2100, 5798, 2730, 1831, 6256, 4920, 10107, 828, 1388]\n"
     ]
    }
   ],
   "source": [
    "for example in data['validation'].take(1):\n",
    "    print(example['idx'].numpy())\n",
    "    print(example['label'].numpy())\n",
    "    print(example['sentence'].numpy())\n",
    "    print(encoder.encode(example['sentence'].numpy()))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
